Ensembl ID ENSG00000187951 had an incorrect gene name assigned to it, so in the newest preprocessing pipeline it was corrected and removed during Filtering (since it was a lncRNA), but removing this gene from the expression dataset seems to have caused bigger changes in the resulting dataset that expected.

Gene ENSG00000187951

# Load raw data
datExpr = read.csv('./../Data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../Data/RNAseq_ASD_datMeta.csv')

datMeta = datMeta %>% mutate(Brain_Region = as.factor(Region)) %>% 
                      mutate(Brain_lobe = ifelse(Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45'), 'Frontal',
                                          ifelse(Brain_Region %in% c('BA3_1_2_5', 'BA7'), 'Parietal',
                                          ifelse(Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22'), 'Temporal',
                                          'Occipital')))) %>%
                      mutate(Batch = as.factor(gsub('/', '.', RNAExtractionBatch)), 
                             Diagnosis = factor(Diagnosis_, levels=c('CTL','ASD'))) %>% 
                      dplyr::select(-Diagnosis_)

# Filter to only keep Frontal Lobe samples
datMeta = datMeta %>% filter(Brain_lobe=='Frontal')
datExpr = datExpr %>% dplyr::select(which(colnames(datExpr) %in% paste0('X',gsub('-','_',datMeta$X))))

The gene has quite a high standard deviation given its mean expression value

plot_data = data.frame('ID' = rownames(datExpr), 'Mean' = rowMeans(datExpr)+1, 'SD' = apply(datExpr, 1, sd)+1, 
                       'is_ENSG00000187951' = rownames(datExpr) == 'ENSG00000187951',
                       alpha = ifelse(rownames(datExpr) == 'ENSG00000187951', 1, 0.1),
                       color = ifelse(rownames(datExpr) == 'ENSG00000187951', '#006699', '#808080'))

plot_data %>% ggplot(aes(Mean, SD)) + geom_point(alpha = plot_data$alpha, color = plot_data$color) +
              geom_abline(color='gray') + scale_x_log10() + scale_y_log10() + theme_minimal()

This high variance is caused by a single Sample, which has a very high and uncharacteristic level of expression when comparing it to the level of expression this gene has in the other samples

plot_data = data.frame('ID' = colnames(datExpr), 'meanExpr' = colMeans(datExpr), 'geneExpr' = t(datExpr[rownames(datExpr) == 'ENSG00000187951',]), 'Diagnosis' = datMeta$Diagnosis)

summary(plot_data$ENSG00000187951)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    7.00   17.25   24.50   87.41   28.75 1460.00
ggplotly(plot_data %>% ggplot(aes(meanExpr, ENSG00000187951, color=Diagnosis)) + geom_point(alpha=0.8) +
         theme_minimal() + xlab('Mean expression by Sample') + ylab('Expression of ENSG00000187951 by Sample'))

Effects in vst transformation

This may be causing changes in the resulting level of expression of the genes when performing the vst transformation to stabilise variance in the dataset

The difference is quite small, but it seems to have linearly lowered the mean level of expression of the genes, affecting the genes with the lowest levels of expression the most

# Original preprocessing
load('./../Data/preprocessed_data_old.RData')
datExpr_before = datExpr %>% data.frame %>% dplyr::select(which(colnames(datExpr) %in% paste0('X',gsub('-','_',datMeta$X))))

# New preprocessing
load('./../Data/preprocessed_data.RData')
datExpr_now = datExpr %>% data.frame

plot_data = data.frame('MeanExpr_before' = rowMeans(datExpr_before)[rownames(datExpr_before)!='ENSG00000187951'],
                       'MeanExpr_after' = rowMeans(datExpr_now)) %>%
            mutate(Diff = MeanExpr_before - MeanExpr_after)

summary(plot_data$Diff)
##       Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
## -2.051e-02 -7.520e-03 -2.843e-03 -5.210e-03 -1.164e-03 -1.114e-05
plot_data %>% ggplot(aes(MeanExpr_before, MeanExpr_after, color=Diff)) + geom_point(alpha=0.1) +
              geom_abline(color='gray') + xlab('Mean Level of Expression Before') + 
              ylab('Mean Level of Expression Now') + coord_fixed() + scale_color_viridis() + 
              theme_minimal()

rm(datExpr, plot_data)

Effects in WGCNA Top Modules

getModuleDiagCor = function(dataset, datExpr){
  datTraits = datMeta %>% dplyr::select(Diagnosis, Sex, Age, PMI, RNAExtractionBatch) %>%
            dplyr::rename('ExtractionBatch' = RNAExtractionBatch)
  
  ME_object = datExpr %>% t %>% moduleEigengenes(colors = dataset$Module)
  MEs = orderMEs(ME_object$eigengenes)
  
  moduleTraitCor = MEs %>% apply(2, function(x) hetcor(x, datTraits)$correlations[1,-1]) %>% t
  moduleDiagCor = moduleTraitCor[,1]
  
  if(sum(!complete.cases(moduleDiagCor))>0){
    for(m in names(moduleDiagCor)){
      if(is.na(moduleDiagCor[m]))
        moduleDiagCor[m] = polyserial(MEs[,m], datMeta$Diagnosis)
    }
  }
  return(moduleDiagCor)
}

clustering_selected = 'DynamicHybridMergedSmall'

# Before
dataset_before = read.csv(paste0('./../Data/dataset_', clustering_selected, '_old.csv'))
dataset_before$Module = dataset_before[,clustering_selected]
moduleDiagCor_before = getModuleDiagCor(dataset_before, datExpr_before)

# Now
dataset_now = read.csv(paste0('./../Data/dataset_', clustering_selected, '.csv'))
dataset_now$Module = dataset_now[,clustering_selected]
moduleDiagCor_now = getModuleDiagCor(dataset_now, datExpr_now)

rm(getModuleDiagCor, clustering_selected)
module = dataset_before$Module[dataset_before$ID=='ENSG00000187951']
cat(paste0('Gene ENSG00000187951 belonged to Module ', module, ' which had a Module-Diagnosis correlation of ',
           round(moduleDiagCor_before[module][[1]],2)))
## Gene ENSG00000187951 belonged to Module #00BF7A which had a Module-Diagnosis correlation of -0.51
rm(module)
top_modules_before = gsub('ME','',names(moduleDiagCor_before)[abs(moduleDiagCor_before)>0.9])
top_modules_now = gsub('ME','',names(moduleDiagCor_now)[abs(moduleDiagCor_now)>0.9])

genes_top_modules_memberships = data.frame('ID' = rownames(datExpr_now))
for(tm in top_modules_before){
  genes_top_modules_memberships[,tm] = dataset_before$Module[dataset_before$ID!='ENSG00000187951']==tm
}
for(tm in top_modules_now){
  genes_top_modules_memberships[,tm] = dataset_now$Module==tm
}

cat('Top Modules sizes before:')
## Top Modules sizes before:
colSums(genes_top_modules_memberships[,1+(1:length(top_modules_before))])
## #FE6E8A #7DAE00 #E08B00 
##      32     149     223
cat('Top Modules sizes now:')
## Top Modules sizes now:
colSums(genes_top_modules_memberships[,(2+length(top_modules_before)):ncol(genes_top_modules_memberships)])
## #D873FC #D69100 
##      23     891

Modules with positive correlation to Diagnosis

pos_corr = genes_top_modules_memberships %>%
           dplyr::select(c(gsub('ME','',names(moduleDiagCor_before)[moduleDiagCor_before>0.9]),
                           gsub('ME','',names(moduleDiagCor_now)[moduleDiagCor_now>0.9])))

grid.newpage()
grid.draw(draw.triple.venn(sum(pos_corr[,1]), sum(pos_corr[,2]), sum(pos_corr[,3]),
                           sum(pos_corr[,1]*pos_corr[,2]), sum(pos_corr[,2]*pos_corr[,3]), sum(pos_corr[,1]*pos_corr[,3]),
                           sum(pos_corr[,1]*pos_corr[,2]*pos_corr[,3]),
          category = paste0(colnames(pos_corr), c(' (Before)', ' (Before)', ' (Now)')),
          fill = colnames(pos_corr), fontfamily = rep('sans-serif',7),
          alpha = rep(0.25,3), lty = rep('blank', 3), cat.fontfamily = rep('sans-serif',3)))

cat(paste0('We recover ', sum(pos_corr[,1] + pos_corr[,2] & pos_corr[,3]>0), '/',
           sum(pos_corr[,1]+pos_corr[,2]), ' of the genes found in the original Module (',
           round(100*sum(pos_corr[,1] + pos_corr[,2] & pos_corr[,3]>0)/(sum(pos_corr[,1]+pos_corr[,2]))),'%)'))
## We recover 206/372 of the genes found in the original Module (55%)

Modules with negative correlation to Diagnosis

neg_corr = genes_top_modules_memberships %>%
           dplyr::select(c(gsub('ME','',names(moduleDiagCor_before)[moduleDiagCor_before < -0.9]),
                           gsub('ME','',names(moduleDiagCor_now)[moduleDiagCor_now < -0.9])))

grid.newpage()
grid.draw(draw.pairwise.venn(sum(neg_corr[,1]), sum(neg_corr[,2]),sum(neg_corr[,1]*neg_corr[,2]),
          category = paste0(colnames(neg_corr), c(' (Before)', ' (Now)')),
          fill = colnames(neg_corr), fontfamily = rep('sans-serif',3),
          alpha = rep(0.25,2), lty = rep('blank', 2), cat.fontfamily = rep('sans-serif',2)))

cat(paste0('We recover ', sum(neg_corr[,1] & neg_corr[,2]>0), '/',
           sum(neg_corr[,1]), ' of the genes found in the original Module (',
           round(100*sum(neg_corr[,1] & neg_corr[,2]>0)/sum(neg_corr[,1])),'%)'))
## We recover 21/32 of the genes found in the original Module (66%)
# Three Modules
# grid.newpage()
# grid.draw(draw.triple.venn(sum(neg_corr[,1]), sum(neg_corr[,2]), sum(neg_corr[,3]),
#                            sum(neg_corr[,1]*neg_corr[,2]), sum(neg_corr[,2]*neg_corr[,3]), sum(neg_corr[,1]*neg_corr[,3]),
#                            sum(neg_corr[,1]*neg_corr[,2]*neg_corr[,3]),
#           category = paste0(colnames(neg_corr), c(' (Before)', ' (Now)', ' (Now)')),
#           fill = colnames(neg_corr), fontfamily = rep('sans-serif',7),
#           alpha = rep(0.25,3), lty = rep('blank', 3), cat.fontfamily = rep('sans-serif',3)))
# 
# cat(paste0('We recover ', sum(neg_corr[,1] & neg_corr[,2]+neg_corr[,3]>0), '/',
#            sum(neg_corr[,1]), ' of the genes found in the original Module (',
#            round(100*sum(neg_corr[,1] & neg_corr[,2]+neg_corr[,3]>0)/sum(neg_corr[,1])),'%)'))